{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"/home/argenisleon/Optimus/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from optimus import Optimus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/argenisleon/.conda/envs/rapids_blazing/lib/python3.7/site-packages/numba/cuda/envvars.py:17: NumbaWarning: \u001b[1m\n",
      "Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.\n",
      "\n",
      "For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')\u001b[0m\n",
      "  warnings.warn(errors.NumbaWarning(msg))\n",
      "/home/argenisleon/.conda/envs/rapids_blazing/lib/python3.7/site-packages/numba/cuda/envvars.py:17: NumbaWarning: \u001b[1m\n",
      "Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice.\n",
      "\n",
      "For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')\u001b[0m\n",
      "  warnings.warn(errors.NumbaWarning(msg))\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "Open Bumblebee: <a target='_blank' href ='https://app.hi-bumblebee.com/?session=734a9a33-84ee-48ec-bd39-8c1c2ac58fd0&key=kMC74PEkDa_tSxj-KMz1_g7LUArgv2u2Ws79w-mcMDE=&view=0'>https://app.hi-bumblebee.com</a><div>If you really care about privacy get your keys in bumblebee.ini and put them<a href ='https://app.hi-bumblebee.com'> here</a></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:Just check that Spark and all necessary environments vars are present...\n",
      "INFO:-----\n",
      "INFO:Starting or setting Dask Client...\n",
      "INFO:Config.ini not found\n"
     ]
    }
   ],
   "source": [
    "op= Optimus(\"dask-cudf\", n_workers=4, threads_per_worker=2,\n",
    "                                          processes=False, memory_limit=\"3G\", comm=True, verbose= True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "latin1\n"
     ]
    }
   ],
   "source": [
    "df = op.load.csv(\"data/crime.csv\", sep=\",\", header=True, infer_schema='false', null_value=\"None\", charset=\"latin1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.repartition(npartitions=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.persist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "INCIDENT_NUMBER         object\n",
       "OFFENSE_CODE             int64\n",
       "OFFENSE_CODE_GROUP      object\n",
       "OFFENSE_DESCRIPTION     object\n",
       "DISTRICT                object\n",
       "REPORTING_AREA         float64\n",
       "SHOOTING                object\n",
       "OCCURRED_ON_DATE        object\n",
       "YEAR                     int64\n",
       "MONTH                    int64\n",
       "DAY_OF_WEEK             object\n",
       "HOUR                     int64\n",
       "UCR_PART                object\n",
       "STREET                  object\n",
       "Lat                    float64\n",
       "Long                   float64\n",
       "Location                object\n",
       "dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_cols = [\"OFFENSE_CODE\",\"YEAR\", \"MONTH\", \"HOUR\"]\n",
    "# cc= [\"INCIDENT_NUMBER\",\"OFFENSE_CODE_GROUP\",\"OFFENSE_DESCRIPTION\",\"DISTRICT\",\"REPORTING_AREA\",\"REPORTING_AREA\",\"SHOOTING\",\"OCCURRED_ON_DATE\",\"DAY_OF_WEEK\",\"UCR_PART\",\"STREET\",\"Location\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "bins = np.array([-30, 0, 3, 6, 9, 10, 19, 50])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using digitize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 140 ms, sys: 36 ms, total: 176 ms\n",
      "Wall time: 1.77 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "((8    319073\n",
       "  dtype: int32, array([-30,   0,   3,   6,   9,  10,  19,  50])), (8    319073\n",
       "  dtype: int32, array([-30,   0,   3,   6,   9,  10,  19,  50])), (2    45263\n",
       "  3    74431\n",
       "  4    99947\n",
       "  5    26543\n",
       "  6    72889\n",
       "  dtype: int32, array([-30,   0,   3,   6,   9,  10,  19,  50])), (2     32068\n",
       "  3     11308\n",
       "  4     26919\n",
       "  5     14740\n",
       "  6    162967\n",
       "  7     71071\n",
       "  dtype: int32, array([-30,   0,   3,   6,   9,  10,  19,  50])))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dask\n",
    "import cudf\n",
    "\n",
    "# def histogram(ser, bins):\n",
    "#     print(type(ser))\n",
    "#     binned = ser.digitize(bins, right=False)\n",
    "#     vc = binned.value_counts().sort_index()\n",
    "#     return vc, cudf.Series(bins)\n",
    "\n",
    "def hist_serie(serie, buckets):    \n",
    "    def func(_serie):\n",
    "        binned = _serie.digitize(bins, right=False)\n",
    "        vc = binned.value_counts().sort_index()\n",
    "        return vc, bins\n",
    "            \n",
    "    return dask.delayed(func)(serie)\n",
    "\n",
    "delayed_tasks =[]\n",
    "for num_col in num_cols:\n",
    "    delayed_tasks.append(hist_serie(df[num_col],10))\n",
    "%time dask.compute(*delayed_tasks)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using Cupy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cupy as cp\n",
    "import dask\n",
    "\n",
    "\n",
    "def hist_serie(serie, buckets):    \n",
    "    def func(_serie):\n",
    "        arr = cp.fromDlpack(_serie.to_dlpack())\n",
    "        return cp.histogram(arr, buckets)\n",
    "            \n",
    "    return dask.delayed(func)(serie)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "((array([12600, 65983, 10820, 18794, 16752,  3015, 23563, 32552, 88665,\n",
      "       46329]), array([ 111.,  483.,  855., 1227., 1599., 1971., 2343., 2715., 3087.,\n",
      "       3459., 3831.])), (array([ 53388,      0,      0,  99114,      0,      0, 100886,      0,\n",
      "            0,  65685]), array([2015. , 2015.3, 2015.6, 2015.9, 2016.2, 2016.5, 2016.8, 2017.1,\n",
      "       2017.4, 2017.7, 2018. ])), (array([45263, 24146, 24086, 26199, 30568, 34556, 34823, 26543, 25737,\n",
      "       47152]), array([ 1. ,  2.1,  3.2,  4.3,  5.4,  6.5,  7.6,  8.7,  9.8, 10.9, 12. ])), (array([32068,  7997,  8337, 36633, 32795, 35525, 53582, 41065, 33438,\n",
      "       37633]), array([ 0. ,  2.3,  4.6,  6.9,  9.2, 11.5, 13.8, 16.1, 18.4, 20.7, 23. ])))\n",
      "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n",
      "Wall time: 43 ms\n"
     ]
    }
   ],
   "source": [
    "delayed_tasks =[]\n",
    "for num_col in num_cols:\n",
    "    delayed_tasks.append(hist_serie(df[num_col],10))\n",
    "%time print(dask.compute(*delayed_tasks))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using Optimus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 28 ms, sys: 0 ns, total: 28 ms\n",
      "Wall time: 55.3 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'OFFENSE_CODE': {'hist': [{'count': 169.0, 'lower': 111.0, 'upper': 297.0},\n",
       "   {'count': 12431.0, 'lower': 297.0, 'upper': 483.0},\n",
       "   {'count': 44188.0, 'lower': 483.0, 'upper': 669.0},\n",
       "   {'count': 21795.0, 'lower': 669.0, 'upper': 855.0},\n",
       "   {'count': 1548.0, 'lower': 855.0, 'upper': 1041.0},\n",
       "   {'count': 9272.0, 'lower': 1041.0, 'upper': 1227.0},\n",
       "   {'count': 16609.0, 'lower': 1227.0, 'upper': 1413.0},\n",
       "   {'count': 2185.0, 'lower': 1413.0, 'upper': 1599.0},\n",
       "   {'count': 216.0, 'lower': 1599.0, 'upper': 1785.0},\n",
       "   {'count': 16536.0, 'lower': 1785.0, 'upper': 1971.0},\n",
       "   {'count': 2759.0, 'lower': 1971.0, 'upper': 2157.0},\n",
       "   {'count': 256.0, 'lower': 2157.0, 'upper': 2343.0},\n",
       "   {'count': 2655.0, 'lower': 2343.0, 'upper': 2529.0},\n",
       "   {'count': 20908.0, 'lower': 2529.0, 'upper': 2715.0},\n",
       "   {'count': 2894.0, 'lower': 2715.0, 'upper': 2901.0},\n",
       "   {'count': 29658.0, 'lower': 2901.0, 'upper': 3087.0},\n",
       "   {'count': 63012.0, 'lower': 3087.0, 'upper': 3273.0},\n",
       "   {'count': 25653.0, 'lower': 3273.0, 'upper': 3459.0},\n",
       "   {'count': 9197.0, 'lower': 3459.0, 'upper': 3645.0},\n",
       "   {'count': 37132.0, 'lower': 3645.0, 'upper': 3831.0}]},\n",
       " 'YEAR': {'hist': [{'count': 53388.0, 'lower': 2015.0, 'upper': 2015.15},\n",
       "   {'count': 0.0, 'lower': 2015.15, 'upper': 2015.3},\n",
       "   {'count': 0.0, 'lower': 2015.3, 'upper': 2015.45},\n",
       "   {'count': 0.0, 'lower': 2015.45, 'upper': 2015.6},\n",
       "   {'count': 0.0, 'lower': 2015.6, 'upper': 2015.75},\n",
       "   {'count': 0.0, 'lower': 2015.75, 'upper': 2015.9},\n",
       "   {'count': 99114.0, 'lower': 2015.9, 'upper': 2016.05},\n",
       "   {'count': 0.0, 'lower': 2016.05, 'upper': 2016.2},\n",
       "   {'count': 0.0, 'lower': 2016.2, 'upper': 2016.35},\n",
       "   {'count': 0.0, 'lower': 2016.35, 'upper': 2016.5},\n",
       "   {'count': 0.0, 'lower': 2016.5, 'upper': 2016.65},\n",
       "   {'count': 0.0, 'lower': 2016.65, 'upper': 2016.8},\n",
       "   {'count': 0.0, 'lower': 2016.8, 'upper': 2016.95},\n",
       "   {'count': 100886.0, 'lower': 2016.95, 'upper': 2017.1},\n",
       "   {'count': 0.0, 'lower': 2017.1, 'upper': 2017.25},\n",
       "   {'count': 0.0, 'lower': 2017.25, 'upper': 2017.4},\n",
       "   {'count': 0.0, 'lower': 2017.4, 'upper': 2017.55},\n",
       "   {'count': 0.0, 'lower': 2017.55, 'upper': 2017.7},\n",
       "   {'count': 0.0, 'lower': 2017.7, 'upper': 2017.85},\n",
       "   {'count': 65685.0, 'lower': 2017.85, 'upper': 2018.0}]},\n",
       " 'MONTH': {'hist': [{'count': 23610.0, 'lower': 1.0, 'upper': 1.55},\n",
       "   {'count': 21653.0, 'lower': 1.55, 'upper': 2.1},\n",
       "   {'count': 0.0, 'lower': 2.1, 'upper': 2.6500000000000004},\n",
       "   {'count': 24146.0, 'lower': 2.6500000000000004, 'upper': 3.2},\n",
       "   {'count': 0.0, 'lower': 3.2, 'upper': 3.75},\n",
       "   {'count': 24086.0, 'lower': 3.75, 'upper': 4.300000000000001},\n",
       "   {'count': 0.0, 'lower': 4.300000000000001, 'upper': 4.8500000000000005},\n",
       "   {'count': 26199.0, 'lower': 4.8500000000000005, 'upper': 5.4},\n",
       "   {'count': 0.0, 'lower': 5.4, 'upper': 5.95},\n",
       "   {'count': 30568.0, 'lower': 5.95, 'upper': 6.5},\n",
       "   {'count': 34556.0, 'lower': 6.5, 'upper': 7.050000000000001},\n",
       "   {'count': 0.0, 'lower': 7.050000000000001, 'upper': 7.6000000000000005},\n",
       "   {'count': 34823.0, 'lower': 7.6000000000000005, 'upper': 8.15},\n",
       "   {'count': 0.0, 'lower': 8.15, 'upper': 8.700000000000001},\n",
       "   {'count': 26543.0, 'lower': 8.700000000000001, 'upper': 9.25},\n",
       "   {'count': 0.0, 'lower': 9.25, 'upper': 9.8},\n",
       "   {'count': 25737.0, 'lower': 9.8, 'upper': 10.350000000000001},\n",
       "   {'count': 0.0, 'lower': 10.350000000000001, 'upper': 10.9},\n",
       "   {'count': 23675.0, 'lower': 10.9, 'upper': 11.450000000000001},\n",
       "   {'count': 23477.0, 'lower': 11.450000000000001, 'upper': 12.0}]},\n",
       " 'HOUR': {'hist': [{'count': 24375.0, 'lower': 0.0, 'upper': 1.15},\n",
       "   {'count': 7693.0, 'lower': 1.15, 'upper': 2.3},\n",
       "   {'count': 4589.0, 'lower': 2.3, 'upper': 3.4499999999999997},\n",
       "   {'count': 3408.0, 'lower': 3.4499999999999997, 'upper': 4.6},\n",
       "   {'count': 3311.0, 'lower': 4.6, 'upper': 5.75},\n",
       "   {'count': 5026.0, 'lower': 5.75, 'upper': 6.8999999999999995},\n",
       "   {'count': 21893.0, 'lower': 6.8999999999999995, 'upper': 8.049999999999999},\n",
       "   {'count': 14740.0, 'lower': 8.049999999999999, 'upper': 9.2},\n",
       "   {'count': 16347.0, 'lower': 9.2, 'upper': 10.35},\n",
       "   {'count': 16448.0, 'lower': 10.35, 'upper': 11.5},\n",
       "   {'count': 18679.0, 'lower': 11.5, 'upper': 12.649999999999999},\n",
       "   {'count': 16846.0,\n",
       "    'lower': 12.649999999999999,\n",
       "    'upper': 13.799999999999999},\n",
       "   {'count': 17189.0, 'lower': 13.799999999999999, 'upper': 14.95},\n",
       "   {'count': 36393.0, 'lower': 14.95, 'upper': 16.099999999999998},\n",
       "   {'count': 20763.0, 'lower': 16.099999999999998, 'upper': 17.25},\n",
       "   {'count': 20302.0, 'lower': 17.25, 'upper': 18.4},\n",
       "   {'count': 17588.0, 'lower': 18.4, 'upper': 19.549999999999997},\n",
       "   {'count': 15850.0, 'lower': 19.549999999999997, 'upper': 20.7},\n",
       "   {'count': 14111.0, 'lower': 20.7, 'upper': 21.849999999999998},\n",
       "   {'count': 23522.0, 'lower': 21.849999999999998, 'upper': 23.0}]}}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "df.cols.hist(num_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids_blazing",
   "language": "python",
   "name": "rapids_blazing"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
